We begin by loading the required libraries and the dataset
flight_exploration <- read.csv("Train.csv", header = TRUE)
str(flight_exploration)
## 'data.frame': 103904 obs. of 25 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ id : int 70172 5047 110028 24026 119299 111157 82113 96462 79485 65725 ...
## $ Gender : chr "Male" "Male" "Female" "Female" ...
## $ Customer.Type : chr "Loyal Customer" "disloyal Customer" "Loyal Customer" "Loyal Customer" ...
## $ Age : int 13 25 26 25 61 26 47 52 41 20 ...
## $ Type.of.Travel : chr "Personal Travel" "Business travel" "Business travel" "Business travel" ...
## $ Class : chr "Eco Plus" "Business" "Business" "Business" ...
## $ Flight.Distance : int 460 235 1142 562 214 1180 1276 2035 853 1061 ...
## $ Inflight.wifi.service : int 3 3 2 2 3 3 2 4 1 3 ...
## $ Departure.Arrival.time.convenient: int 4 2 2 5 3 4 4 3 2 3 ...
## $ Ease.of.Online.booking : int 3 3 2 5 3 2 2 4 2 3 ...
## $ Gate.location : int 1 3 2 5 3 1 3 4 2 4 ...
## $ Food.and.drink : int 5 1 5 2 4 1 2 5 4 2 ...
## $ Online.boarding : int 3 3 5 2 5 2 2 5 3 3 ...
## $ Seat.comfort : int 5 1 5 2 5 1 2 5 3 3 ...
## $ Inflight.entertainment : int 5 1 5 2 3 1 2 5 1 2 ...
## $ On.board.service : int 4 1 4 2 3 3 3 5 1 2 ...
## $ Leg.room.service : int 3 5 3 5 4 4 3 5 2 3 ...
## $ Baggage.handling : int 4 3 4 3 4 4 4 5 1 4 ...
## $ Checkin.service : int 4 1 4 1 3 4 3 4 4 4 ...
## $ Inflight.service : int 5 4 4 4 3 4 5 5 1 3 ...
## $ Cleanliness : int 5 1 5 2 3 1 2 4 2 2 ...
## $ Departure.Delay.in.Minutes : int 25 1 0 11 0 0 9 4 0 0 ...
## $ Arrival.Delay.in.Minutes : num 18 6 0 9 0 0 23 0 0 0 ...
## $ satisfaction : chr "neutral or dissatisfied" "neutral or dissatisfied" "satisfied" "neutral or dissatisfied" ...
The services are grouped in three categories- inflight serives, airport services, and online services.
#Classifying the experience into Airport,Inflight and online
Group <- rename(flight_exploration, Inflight1 = 'Seat.comfort',
Airport1 = 'Departure.Arrival.time.convenient',
Inflight2 ='Food.and.drink',
Airport2 = 'Gate.location',
Inflight3 = 'Inflight.wifi.service',
Inflight4 = 'Inflight.entertainment',
Online1 = 'Ease.of.Online.booking',
Inflight5 = 'On.board.service',
Inflight6 = 'Leg.room.service',
Airport3 = 'Baggage.handling',
Airport4 = 'Checkin.service',
Inflight7 = Cleanliness,
Online3 = 'Online.boarding')
#Output of the grouping
Group <- Group %>%
mutate(InflightExperience =rowMeans(Group %>% select(starts_with("Inflight"))),
AirportExperience=rowMeans(Group %>% select(starts_with("Airport"))),
OnlineExperience=rowMeans(Group %>% select(starts_with("Online"))),
male=ifelse(Gender == 1,0,1))
glimpse(Group)
## Rows: 103,904
## Columns: 29
## $ X <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1…
## $ id <int> 70172, 5047, 110028, 24026, 119299, 111157,…
## $ Gender <chr> "Male", "Male", "Female", "Female", "Male",…
## $ Customer.Type <chr> "Loyal Customer", "disloyal Customer", "Loy…
## $ Age <int> 13, 25, 26, 25, 61, 26, 47, 52, 41, 20, 24,…
## $ Type.of.Travel <chr> "Personal Travel", "Business travel", "Busi…
## $ Class <chr> "Eco Plus", "Business", "Business", "Busine…
## $ Flight.Distance <int> 460, 235, 1142, 562, 214, 1180, 1276, 2035,…
## $ Inflight3 <int> 3, 3, 2, 2, 3, 3, 2, 4, 1, 3, 4, 2, 1, 4, 3…
## $ Airport1 <int> 4, 2, 2, 5, 3, 4, 4, 3, 2, 3, 5, 4, 4, 2, 2…
## $ Online1 <int> 3, 3, 2, 5, 3, 2, 2, 4, 2, 3, 5, 2, 4, 4, 3…
## $ Airport2 <int> 1, 3, 2, 5, 3, 1, 3, 4, 2, 4, 4, 2, 4, 3, 2…
## $ Inflight2 <int> 5, 1, 5, 2, 4, 1, 2, 5, 4, 2, 2, 1, 1, 4, 2…
## $ Online3 <int> 3, 3, 5, 2, 5, 2, 2, 5, 3, 3, 5, 2, 1, 4, 3…
## $ Inflight1 <int> 5, 1, 5, 2, 5, 1, 2, 5, 3, 3, 2, 1, 1, 4, 2…
## $ Inflight4 <int> 5, 1, 5, 2, 3, 1, 2, 5, 1, 2, 2, 1, 1, 4, 2…
## $ Inflight5 <int> 4, 1, 4, 2, 3, 3, 3, 5, 1, 2, 3, 1, 1, 4, 4…
## $ Inflight6 <int> 3, 5, 3, 5, 4, 4, 3, 5, 2, 3, 3, 2, 1, 5, 3…
## $ Airport3 <int> 4, 3, 4, 3, 4, 4, 4, 5, 1, 4, 5, 5, 3, 2, 2…
## $ Airport4 <int> 4, 1, 4, 1, 3, 4, 3, 4, 4, 4, 3, 5, 4, 2, 2…
## $ Inflight.service <int> 5, 4, 4, 4, 3, 4, 5, 5, 1, 3, 5, 5, 4, 2, 1…
## $ Inflight7 <int> 5, 1, 5, 2, 3, 1, 2, 4, 2, 2, 2, 1, 1, 4, 2…
## $ Departure.Delay.in.Minutes <int> 25, 1, 0, 11, 0, 0, 9, 4, 0, 0, 0, 0, 28, 0…
## $ Arrival.Delay.in.Minutes <dbl> 18, 6, 0, 9, 0, 0, 23, 0, 0, 0, 0, 0, 8, 0,…
## $ satisfaction <chr> "neutral or dissatisfied", "neutral or diss…
## $ InflightExperience <dbl> 4.375, 2.125, 4.125, 2.625, 3.500, 2.250, 2…
## $ AirportExperience <dbl> 3.25, 2.25, 3.00, 3.50, 3.25, 3.25, 3.50, 4…
## $ OnlineExperience <dbl> 3.0, 3.0, 3.5, 3.5, 4.0, 2.0, 2.0, 4.5, 2.5…
## $ male <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
# Plot of inflight Experience
ggplot(Group) +
aes(x = InflightExperience) +
geom_histogram(aes(y = ..density..)) +
geom_density()+
labs(title = "Histogram of Inflight Experience Satisfaction Scores")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Plot of Airport experience
ggplot(Group) +
aes(x = AirportExperience) +
geom_histogram(aes(y = ..density..)) +
geom_density()+
labs(title = "Histogram of Airport Experience Satisfaction Scores")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Plot of Online experience
ggplot(Group) +
aes(x = OnlineExperience) +
geom_histogram(aes(y = ..density..)) +
geom_density()+
labs(title = "Histogram of Online Experience Satisfaction Scores")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Gathering and renaming for better visualization
sat <- gather(flight_exploration, 'Seat.comfort', 'Departure.Arrival.time.convenient', 'Food.and.drink', 'Gate.location', 'Inflight.wifi.service', 'Inflight.entertainment', 'Ease.of.Online.booking', 'On.board.service', 'Leg.room.service','Baggage.handling', 'Checkin.service', Cleanliness, 'Online.boarding', key = "Criteria", value = "Satisfaction_Scale")
sat$Criteria[sat$Criteria=="Seat.comfort"] <- "A1"
sat$Criteria[sat$Criteria=="Departure.Arrival.time.convenient"] <- "A2"
sat$Criteria[sat$Criteria=="Food.and.drink"] <- "A3"
sat$Criteria[sat$Criteria=="Gate.location"] <- "A4"
sat$Criteria[sat$Criteria=="Inflight.wifi.service"] <- "A5"
sat$Criteria[sat$Criteria=="Inflight.entertainment"] <- "A6"
sat$Criteria[sat$Criteria=="Ease.of.Online.booking"] <- "A7"
sat$Criteria[sat$Criteria=="On.board.service"] <- "A8"
sat$Criteria[sat$Criteria=="Leg.room.service"] <- "A9"
sat$Criteria[sat$Criteria=="Baggage.handling"] <- "A10"
sat$Criteria[sat$Criteria=="Checkin.service"] <- "A11"
sat$Criteria[sat$Criteria=="Cleanliness"] <- "A12"
sat$Criteria[sat$Criteria=="Online.boarding"] <- "A13"
sat$Criteria <- as.factor(sat$Criteria)
glimpse(sat)
## Rows: 1,350,752
## Columns: 14
## $ X <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1…
## $ id <int> 70172, 5047, 110028, 24026, 119299, 111157,…
## $ Gender <chr> "Male", "Male", "Female", "Female", "Male",…
## $ Customer.Type <chr> "Loyal Customer", "disloyal Customer", "Loy…
## $ Age <int> 13, 25, 26, 25, 61, 26, 47, 52, 41, 20, 24,…
## $ Type.of.Travel <chr> "Personal Travel", "Business travel", "Busi…
## $ Class <chr> "Eco Plus", "Business", "Business", "Busine…
## $ Flight.Distance <int> 460, 235, 1142, 562, 214, 1180, 1276, 2035,…
## $ Inflight.service <int> 5, 4, 4, 4, 3, 4, 5, 5, 1, 3, 5, 5, 4, 2, 1…
## $ Departure.Delay.in.Minutes <int> 25, 1, 0, 11, 0, 0, 9, 4, 0, 0, 0, 0, 28, 0…
## $ Arrival.Delay.in.Minutes <dbl> 18, 6, 0, 9, 0, 0, 23, 0, 0, 0, 0, 0, 8, 0,…
## $ satisfaction <chr> "neutral or dissatisfied", "neutral or diss…
## $ Criteria <fct> A1, A1, A1, A1, A1, A1, A1, A1, A1, A1, A1,…
## $ Satisfaction_Scale <int> 5, 1, 5, 2, 5, 1, 2, 5, 3, 3, 2, 1, 1, 4, 2…
Now, a box plot is plotted to check the services which had lower scores than the average scores
sat %>%
mutate(class = fct_reorder(Criteria, Satisfaction_Scale, .fun = 'mean')) %>%
ggplot(aes(x=reorder(Criteria, Satisfaction_Scale), y = Satisfaction_Scale, fill = Criteria)) +
geom_boxplot()+
stat_summary(fun.y = "mean", geom = "point", shape = 10, size = 3,fill = "Yellow") +
geom_hline(aes(yintercept = mean(Satisfaction_Scale)), linetype="dashed",color = "Orange", size = 1.2)+
scale_fill_brewer() +
labs(title = "Boxplot illustrating the Mean of Satisfaction Level",
caption = "BoxPlot",
x = "Satisfaction Criteria",
y = "Satisfaction Scale")+
theme(legend.position = "none")
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors
# inflight wifi
ggplot(flight_exploration,
aes(x = satisfaction,
y = Inflight.wifi.service)) +
geom_boxplot()
# departure arrival time convenient
ggplot(flight_exploration,
aes(x = satisfaction,
y = Departure.Arrival.time.convenient)) +
geom_boxplot()
# ease of online booking
ggplot(flight_exploration,
aes(x = satisfaction,
y = Ease.of.Online.booking)) +
geom_boxplot()
# gate location
ggplot(flight_exploration,
aes(x = satisfaction,
y = Gate.location)) +
geom_boxplot()
# food and drink
ggplot(flight_exploration,
aes(x = satisfaction,
y = Food.and.drink)) +
geom_boxplot()
# online boarding
ggplot(flight_exploration,
aes(x = satisfaction,
y = Online.boarding)) +
geom_boxplot()
# seat comfort
ggplot(flight_exploration,
aes(x = satisfaction,
y = Seat.comfort)) +
geom_boxplot()
# inflight entertainment
ggplot(flight_exploration,
aes(x = satisfaction,
y = Inflight.entertainment)) +
geom_boxplot()
# on board services
ggplot(flight_exploration,
aes(x = satisfaction,
y = On.board.service)) +
geom_boxplot()
# leg room
ggplot(flight_exploration,
aes(x = satisfaction,
y = Leg.room.service)) +
geom_boxplot()
# baggage handling
ggplot(flight_exploration,
aes(x = satisfaction,
y = Baggage.handling)) +
geom_boxplot()
# checkin service
ggplot(flight_exploration,
aes(x = satisfaction,
y = Checkin.service)) +
geom_boxplot()
# inflight service
ggplot(flight_exploration,
aes(x = satisfaction,
y = Inflight.service)) +
geom_boxplot()
# cleanliness
ggplot(flight_exploration,
aes(x = satisfaction,
y = Cleanliness)) +
geom_boxplot()
We load the necessary libraries and store the flight survey dataset as dataframe
flight_survey <- flight_exploration
str(flight_survey)
## 'data.frame': 103904 obs. of 25 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ id : int 70172 5047 110028 24026 119299 111157 82113 96462 79485 65725 ...
## $ Gender : chr "Male" "Male" "Female" "Female" ...
## $ Customer.Type : chr "Loyal Customer" "disloyal Customer" "Loyal Customer" "Loyal Customer" ...
## $ Age : int 13 25 26 25 61 26 47 52 41 20 ...
## $ Type.of.Travel : chr "Personal Travel" "Business travel" "Business travel" "Business travel" ...
## $ Class : chr "Eco Plus" "Business" "Business" "Business" ...
## $ Flight.Distance : int 460 235 1142 562 214 1180 1276 2035 853 1061 ...
## $ Inflight.wifi.service : int 3 3 2 2 3 3 2 4 1 3 ...
## $ Departure.Arrival.time.convenient: int 4 2 2 5 3 4 4 3 2 3 ...
## $ Ease.of.Online.booking : int 3 3 2 5 3 2 2 4 2 3 ...
## $ Gate.location : int 1 3 2 5 3 1 3 4 2 4 ...
## $ Food.and.drink : int 5 1 5 2 4 1 2 5 4 2 ...
## $ Online.boarding : int 3 3 5 2 5 2 2 5 3 3 ...
## $ Seat.comfort : int 5 1 5 2 5 1 2 5 3 3 ...
## $ Inflight.entertainment : int 5 1 5 2 3 1 2 5 1 2 ...
## $ On.board.service : int 4 1 4 2 3 3 3 5 1 2 ...
## $ Leg.room.service : int 3 5 3 5 4 4 3 5 2 3 ...
## $ Baggage.handling : int 4 3 4 3 4 4 4 5 1 4 ...
## $ Checkin.service : int 4 1 4 1 3 4 3 4 4 4 ...
## $ Inflight.service : int 5 4 4 4 3 4 5 5 1 3 ...
## $ Cleanliness : int 5 1 5 2 3 1 2 4 2 2 ...
## $ Departure.Delay.in.Minutes : int 25 1 0 11 0 0 9 4 0 0 ...
## $ Arrival.Delay.in.Minutes : num 18 6 0 9 0 0 23 0 0 0 ...
## $ satisfaction : chr "neutral or dissatisfied" "neutral or dissatisfied" "satisfied" "neutral or dissatisfied" ...
summary(flight_survey)
## X id Gender Customer.Type
## Min. : 0 Min. : 1 Length:103904 Length:103904
## 1st Qu.: 25976 1st Qu.: 32534 Class :character Class :character
## Median : 51952 Median : 64856 Mode :character Mode :character
## Mean : 51952 Mean : 64924
## 3rd Qu.: 77927 3rd Qu.: 97368
## Max. :103903 Max. :129880
##
## Age Type.of.Travel Class Flight.Distance
## Min. : 7.00 Length:103904 Length:103904 Min. : 31
## 1st Qu.:27.00 Class :character Class :character 1st Qu.: 414
## Median :40.00 Mode :character Mode :character Median : 843
## Mean :39.38 Mean :1189
## 3rd Qu.:51.00 3rd Qu.:1743
## Max. :85.00 Max. :4983
##
## Inflight.wifi.service Departure.Arrival.time.convenient Ease.of.Online.booking
## Min. :0.00 Min. :0.00 Min. :0.000
## 1st Qu.:2.00 1st Qu.:2.00 1st Qu.:2.000
## Median :3.00 Median :3.00 Median :3.000
## Mean :2.73 Mean :3.06 Mean :2.757
## 3rd Qu.:4.00 3rd Qu.:4.00 3rd Qu.:4.000
## Max. :5.00 Max. :5.00 Max. :5.000
##
## Gate.location Food.and.drink Online.boarding Seat.comfort
## Min. :0.000 Min. :0.000 Min. :0.00 Min. :0.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:2.000
## Median :3.000 Median :3.000 Median :3.00 Median :4.000
## Mean :2.977 Mean :3.202 Mean :3.25 Mean :3.439
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.00 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.00 Max. :5.000
##
## Inflight.entertainment On.board.service Leg.room.service Baggage.handling
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.000
## Median :4.000 Median :4.000 Median :4.000 Median :4.000
## Mean :3.358 Mean :3.382 Mean :3.351 Mean :3.632
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
##
## Checkin.service Inflight.service Cleanliness Departure.Delay.in.Minutes
## Min. :0.000 Min. :0.00 Min. :0.000 Min. : 0.00
## 1st Qu.:3.000 1st Qu.:3.00 1st Qu.:2.000 1st Qu.: 0.00
## Median :3.000 Median :4.00 Median :3.000 Median : 0.00
## Mean :3.304 Mean :3.64 Mean :3.286 Mean : 14.82
## 3rd Qu.:4.000 3rd Qu.:5.00 3rd Qu.:4.000 3rd Qu.: 12.00
## Max. :5.000 Max. :5.00 Max. :5.000 Max. :1592.00
##
## Arrival.Delay.in.Minutes satisfaction
## Min. : 0.00 Length:103904
## 1st Qu.: 0.00 Class :character
## Median : 0.00 Mode :character
## Mean : 15.18
## 3rd Qu.: 13.00
## Max. :1584.00
## NA's :310
Here we remove the id columns, change the type of the variables, and removing the outliers
# removing X and id
flight_survey <- flight_survey[, -c(1, 2)]
# changing the characters to factors
flight_survey$Gender <- as.factor(flight_survey$Gender)
flight_survey$Customer.Type <-as.factor(flight_survey$Customer.Type)
flight_survey$Type.of.Travel<-as.factor(flight_survey$Type.of.Travel)
flight_survey$Class<-as.factor(flight_survey$Class)
flight_survey$satisfaction<-as.factor(flight_survey$satisfaction)
# changing the labels
levels(flight_survey$satisfaction) <- c("dissatisfied", "satisfied")
# removing outliers
iqr <- IQR(flight_survey$Departure.Delay.in.Minutes) #removing outliers in departure delays
Q <- quantile(flight_survey$Departure.Delay.in.Minutes, probs=c(.25, .75), na.rm = FALSE)
flight_survey<- subset(flight_survey, flight_survey$Departure.Delay.in.Minutes > (Q[1] - 2.5*iqr) & flight_survey$Departure.Delay.in.Minutes < (Q[2]+2.5*iqr))
iqr2 <- IQR(flight_survey$Arrival.Delay.in.Minutes, na.rm = TRUE) #removing outliers in arrival delays
Q1 <- quantile(flight_survey$Arrival.Delay.in.Minutes, probs=c(.25, .75), na.rm = TRUE)
flight_survey<- subset(flight_survey, flight_survey$Arrival.Delay.in.Minutes > (Q1[1] - 2.5*iqr2) & flight_survey$Arrival.Delay.in.Minutes < (Q1[2]+2.5*iqr2))
str(flight_survey)
## 'data.frame': 83620 obs. of 23 variables:
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 1 1 2 1 1 1 2 1 ...
## $ Customer.Type : Factor w/ 2 levels "disloyal Customer",..: 2 1 2 2 2 2 2 2 1 1 ...
## $ Age : int 13 25 26 25 61 26 52 41 20 24 ...
## $ Type.of.Travel : Factor w/ 2 levels "Business travel",..: 2 1 1 1 1 2 1 1 1 1 ...
## $ Class : Factor w/ 3 levels "Business","Eco",..: 3 1 1 1 1 2 1 1 2 2 ...
## $ Flight.Distance : int 460 235 1142 562 214 1180 2035 853 1061 1182 ...
## $ Inflight.wifi.service : int 3 3 2 2 3 3 4 1 3 4 ...
## $ Departure.Arrival.time.convenient: int 4 2 2 5 3 4 3 2 3 5 ...
## $ Ease.of.Online.booking : int 3 3 2 5 3 2 4 2 3 5 ...
## $ Gate.location : int 1 3 2 5 3 1 4 2 4 4 ...
## $ Food.and.drink : int 5 1 5 2 4 1 5 4 2 2 ...
## $ Online.boarding : int 3 3 5 2 5 2 5 3 3 5 ...
## $ Seat.comfort : int 5 1 5 2 5 1 5 3 3 2 ...
## $ Inflight.entertainment : int 5 1 5 2 3 1 5 1 2 2 ...
## $ On.board.service : int 4 1 4 2 3 3 5 1 2 3 ...
## $ Leg.room.service : int 3 5 3 5 4 4 5 2 3 3 ...
## $ Baggage.handling : int 4 3 4 3 4 4 5 1 4 5 ...
## $ Checkin.service : int 4 1 4 1 3 4 4 4 4 3 ...
## $ Inflight.service : int 5 4 4 4 3 4 5 1 3 5 ...
## $ Cleanliness : int 5 1 5 2 3 1 4 2 2 2 ...
## $ Departure.Delay.in.Minutes : int 25 1 0 11 0 0 4 0 0 0 ...
## $ Arrival.Delay.in.Minutes : num 18 6 0 9 0 0 0 0 0 0 ...
## $ satisfaction : Factor w/ 2 levels "dissatisfied",..: 1 1 2 1 2 1 2 1 1 1 ...
# removing the missing value
colSums(is.na(flight_survey))
## Gender Customer.Type
## 0 0
## Age Type.of.Travel
## 0 0
## Class Flight.Distance
## 0 0
## Inflight.wifi.service Departure.Arrival.time.convenient
## 0 0
## Ease.of.Online.booking Gate.location
## 0 0
## Food.and.drink Online.boarding
## 0 0
## Seat.comfort Inflight.entertainment
## 0 0
## On.board.service Leg.room.service
## 0 0
## Baggage.handling Checkin.service
## 0 0
## Inflight.service Cleanliness
## 0 0
## Departure.Delay.in.Minutes Arrival.Delay.in.Minutes
## 0 0
## satisfaction
## 0
flight_survey<- na.omit(flight_survey)
mdl1 <- glm(satisfaction~.-Departure.Delay.in.Minutes , data=flight_survey, family="binomial")
options(scipen=999)
summary(mdl1)
##
## Call:
## glm(formula = satisfaction ~ . - Departure.Delay.in.Minutes,
## family = "binomial", data = flight_survey)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7617 -0.5037 -0.1701 0.3960 3.9350
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -7.48464535 0.08709011 -85.941
## GenderMale 0.04862715 0.02149084 2.263
## Customer.TypeLoyal Customer 2.09080861 0.03315771 63.056
## Age -0.00961006 0.00077830 -12.347
## Type.of.TravelPersonal Travel -2.67385210 0.03417067 -78.250
## ClassEco -0.71976384 0.02846077 -25.290
## ClassEco Plus -0.77550378 0.04563738 -16.993
## Flight.Distance -0.00001733 0.00001264 -1.371
## Inflight.wifi.service 0.42483003 0.01288161 32.980
## Departure.Arrival.time.convenient -0.12386715 0.00885696 -13.985
## Ease.of.Online.booking -0.18162891 0.01263668 -14.373
## Gate.location 0.03870844 0.00995289 3.889
## Food.and.drink -0.05826911 0.01209818 -4.816
## Online.boarding 0.59979578 0.01125019 53.314
## Seat.comfort 0.08437149 0.01222310 6.903
## Inflight.entertainment 0.03514923 0.01638286 2.145
## On.board.service 0.31410850 0.01137476 27.615
## Leg.room.service 0.25728844 0.00933544 27.560
## Baggage.handling 0.12367191 0.01255443 9.851
## Checkin.service 0.30743826 0.00937486 32.794
## Inflight.service 0.09777242 0.01339472 7.299
## Cleanliness 0.24029470 0.01346982 17.839
## Arrival.Delay.in.Minutes -0.05026857 0.00223788 -22.463
## Pr(>|z|)
## (Intercept) < 0.0000000000000002 ***
## GenderMale 0.023655 *
## Customer.TypeLoyal Customer < 0.0000000000000002 ***
## Age < 0.0000000000000002 ***
## Type.of.TravelPersonal Travel < 0.0000000000000002 ***
## ClassEco < 0.0000000000000002 ***
## ClassEco Plus < 0.0000000000000002 ***
## Flight.Distance 0.170325
## Inflight.wifi.service < 0.0000000000000002 ***
## Departure.Arrival.time.convenient < 0.0000000000000002 ***
## Ease.of.Online.booking < 0.0000000000000002 ***
## Gate.location 0.000101 ***
## Food.and.drink 0.000001462064512 ***
## Online.boarding < 0.0000000000000002 ***
## Seat.comfort 0.000000000005105 ***
## Inflight.entertainment 0.031914 *
## On.board.service < 0.0000000000000002 ***
## Leg.room.service < 0.0000000000000002 ***
## Baggage.handling < 0.0000000000000002 ***
## Checkin.service < 0.0000000000000002 ***
## Inflight.service 0.000000000000289 ***
## Cleanliness < 0.0000000000000002 ***
## Arrival.Delay.in.Minutes < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 115147 on 83619 degrees of freedom
## Residual deviance: 56852 on 83597 degrees of freedom
## AIC: 56898
##
## Number of Fisher Scoring iterations: 5
Using step regression to
library(caret)
#training control for cross validation
tr <- trainControl(method="cv", number=10)
#using feature selection
mdl2 <- step(mdl1, direction="both", trainControl=tr)
## Start: AIC=56898.29
## satisfaction ~ (Gender + Customer.Type + Age + Type.of.Travel +
## Class + Flight.Distance + Inflight.wifi.service + Departure.Arrival.time.convenient +
## Ease.of.Online.booking + Gate.location + Food.and.drink +
## Online.boarding + Seat.comfort + Inflight.entertainment +
## On.board.service + Leg.room.service + Baggage.handling +
## Checkin.service + Inflight.service + Cleanliness + Departure.Delay.in.Minutes +
## Arrival.Delay.in.Minutes) - Departure.Delay.in.Minutes
##
## Df Deviance AIC
## - Flight.Distance 1 56854 56898
## <none> 56852 56898
## - Inflight.entertainment 1 56857 56901
## - Gender 1 56857 56901
## - Gate.location 1 56867 56911
## - Food.and.drink 1 56876 56920
## - Seat.comfort 1 56900 56944
## - Inflight.service 1 56906 56950
## - Baggage.handling 1 56950 56994
## - Age 1 57006 57050
## - Departure.Arrival.time.convenient 1 57047 57091
## - Ease.of.Online.booking 1 57061 57105
## - Cleanliness 1 57172 57216
## - Arrival.Delay.in.Minutes 1 57371 57415
## - Class 2 57552 57594
## - Leg.room.service 1 57617 57661
## - On.board.service 1 57631 57675
## - Checkin.service 1 57961 58005
## - Inflight.wifi.service 1 57980 58024
## - Online.boarding 1 59868 59912
## - Customer.Type 1 61229 61273
## - Type.of.Travel 1 63961 64005
##
## Step: AIC=56898.17
## satisfaction ~ Gender + Customer.Type + Age + Type.of.Travel +
## Class + Inflight.wifi.service + Departure.Arrival.time.convenient +
## Ease.of.Online.booking + Gate.location + Food.and.drink +
## Online.boarding + Seat.comfort + Inflight.entertainment +
## On.board.service + Leg.room.service + Baggage.handling +
## Checkin.service + Inflight.service + Cleanliness + Arrival.Delay.in.Minutes
##
## Df Deviance AIC
## <none> 56854 56898
## + Flight.Distance 1 56852 56898
## - Inflight.entertainment 1 56859 56901
## - Gender 1 56859 56901
## - Gate.location 1 56869 56911
## - Food.and.drink 1 56877 56919
## - Seat.comfort 1 56901 56943
## - Inflight.service 1 56908 56950
## - Baggage.handling 1 56952 56994
## - Age 1 57006 57048
## - Departure.Arrival.time.convenient 1 57050 57092
## - Ease.of.Online.booking 1 57063 57105
## - Cleanliness 1 57174 57216
## - Arrival.Delay.in.Minutes 1 57373 57415
## - Class 2 57614 57654
## - Leg.room.service 1 57617 57659
## - On.board.service 1 57632 57674
## - Checkin.service 1 57963 58005
## - Inflight.wifi.service 1 57990 58032
## - Online.boarding 1 59868 59910
## - Customer.Type 1 61505 61547
## - Type.of.Travel 1 64053 64095
summary(mdl2)
##
## Call:
## glm(formula = satisfaction ~ Gender + Customer.Type + Age + Type.of.Travel +
## Class + Inflight.wifi.service + Departure.Arrival.time.convenient +
## Ease.of.Online.booking + Gate.location + Food.and.drink +
## Online.boarding + Seat.comfort + Inflight.entertainment +
## On.board.service + Leg.room.service + Baggage.handling +
## Checkin.service + Inflight.service + Cleanliness + Arrival.Delay.in.Minutes,
## family = "binomial", data = flight_survey)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7671 -0.5036 -0.1704 0.3959 3.9239
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -7.5046728 0.0858980 -87.367
## GenderMale 0.0486408 0.0214906 2.263
## Customer.TypeLoyal Customer 2.0788437 0.0319685 65.028
## Age -0.0095416 0.0007766 -12.286
## Type.of.TravelPersonal Travel -2.6682199 0.0339176 -78.668
## ClassEco -0.7077195 0.0270699 -26.144
## ClassEco Plus -0.7610253 0.0443803 -17.148
## Inflight.wifi.service 0.4257968 0.0128655 33.096
## Departure.Arrival.time.convenient -0.1240473 0.0088546 -14.009
## Ease.of.Online.booking -0.1819713 0.0126371 -14.400
## Gate.location 0.0388148 0.0099518 3.900
## Food.and.drink -0.0580765 0.0120987 -4.800
## Online.boarding 0.5993384 0.0112455 53.296
## Seat.comfort 0.0841054 0.0122212 6.882
## Inflight.entertainment 0.0350499 0.0163826 2.139
## On.board.service 0.3138995 0.0113705 27.606
## Leg.room.service 0.2568505 0.0093286 27.534
## Baggage.handling 0.1239279 0.0125491 9.875
## Checkin.service 0.3073560 0.0093740 32.788
## Inflight.service 0.0980817 0.0133884 7.326
## Cleanliness 0.2402937 0.0134716 17.837
## Arrival.Delay.in.Minutes -0.0502807 0.0022379 -22.467
## Pr(>|z|)
## (Intercept) < 0.0000000000000002 ***
## GenderMale 0.0236 *
## Customer.TypeLoyal Customer < 0.0000000000000002 ***
## Age < 0.0000000000000002 ***
## Type.of.TravelPersonal Travel < 0.0000000000000002 ***
## ClassEco < 0.0000000000000002 ***
## ClassEco Plus < 0.0000000000000002 ***
## Inflight.wifi.service < 0.0000000000000002 ***
## Departure.Arrival.time.convenient < 0.0000000000000002 ***
## Ease.of.Online.booking < 0.0000000000000002 ***
## Gate.location 0.000096082221676 ***
## Food.and.drink 0.000001584776660 ***
## Online.boarding < 0.0000000000000002 ***
## Seat.comfort 0.000000000005905 ***
## Inflight.entertainment 0.0324 *
## On.board.service < 0.0000000000000002 ***
## Leg.room.service < 0.0000000000000002 ***
## Baggage.handling < 0.0000000000000002 ***
## Checkin.service < 0.0000000000000002 ***
## Inflight.service 0.000000000000237 ***
## Cleanliness < 0.0000000000000002 ***
## Arrival.Delay.in.Minutes < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 115147 on 83619 degrees of freedom
## Residual deviance: 56854 on 83598 degrees of freedom
## AIC: 56898
##
## Number of Fisher Scoring iterations: 5
After running step, we get the same model
Testing data pre-processing
#loading the testing dataset
flight_survey.test <- read.csv('test.csv', header=TRUE)
colSums(is.na(flight_survey.test)) #only four observations that are missing values.
## X id
## 0 0
## Gender Customer.Type
## 0 0
## Age Type.of.Travel
## 0 0
## Class Flight.Distance
## 0 0
## Inflight.wifi.service Departure.Arrival.time.convenient
## 0 0
## Ease.of.Online.booking Gate.location
## 0 0
## Food.and.drink Online.boarding
## 0 0
## Seat.comfort Inflight.entertainment
## 0 0
## On.board.service Leg.room.service
## 0 0
## Baggage.handling Checkin.service
## 0 0
## Inflight.service Cleanliness
## 0 0
## Departure.Delay.in.Minutes Arrival.Delay.in.Minutes
## 0 83
## satisfaction
## 0
flight_survey.test<- na.omit(flight_survey.test)
# removing X and id
flight_survey.test <- flight_survey.test[, -c(1, 2)]
# data pre-processing
flight_survey.test$Gender <- as.factor(flight_survey.test$Gender)
flight_survey.test$Customer.Type <-as.factor(flight_survey.test$Customer.Type)
flight_survey.test$Type.of.Travel<-as.factor(flight_survey.test$Type.of.Travel)
flight_survey.test$Class<-as.factor(flight_survey.test$Class)
flight_survey.test$satisfaction<-as.factor(flight_survey.test$satisfaction)
levels(flight_survey.test$satisfaction) <- c("dissatisfied", "satisfied")
str(flight_survey.test)
## 'data.frame': 25893 obs. of 23 variables:
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 2 2 1 2 1 1 2 1 ...
## $ Customer.Type : Factor w/ 2 levels "disloyal Customer",..: 2 2 1 2 2 2 2 2 2 2 ...
## $ Age : int 52 36 20 44 49 16 77 43 47 46 ...
## $ Type.of.Travel : Factor w/ 2 levels "Business travel",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Class : Factor w/ 3 levels "Business","Eco",..: 2 1 2 1 2 2 1 1 2 1 ...
## $ Flight.Distance : int 160 2863 192 3377 1182 311 3987 2556 556 1744 ...
## $ Inflight.wifi.service : int 5 1 2 0 2 3 5 2 5 2 ...
## $ Departure.Arrival.time.convenient: int 4 1 0 0 3 3 5 2 2 2 ...
## $ Ease.of.Online.booking : int 3 3 2 0 4 3 5 2 2 2 ...
## $ Gate.location : int 4 1 4 2 3 3 5 2 2 2 ...
## $ Food.and.drink : int 3 5 2 3 4 5 3 4 5 3 ...
## $ Online.boarding : int 4 4 2 4 1 5 5 4 5 4 ...
## $ Seat.comfort : int 3 5 2 4 2 3 5 5 5 4 ...
## $ Inflight.entertainment : int 5 4 2 1 2 5 5 4 5 4 ...
## $ On.board.service : int 5 4 4 1 2 4 5 4 2 4 ...
## $ Leg.room.service : int 5 4 1 1 2 3 5 4 2 4 ...
## $ Baggage.handling : int 5 4 3 1 2 1 5 4 5 4 ...
## $ Checkin.service : int 2 3 2 3 4 1 4 5 3 5 ...
## $ Inflight.service : int 5 4 2 1 2 2 5 4 3 4 ...
## $ Cleanliness : int 5 5 2 4 4 5 3 3 5 4 ...
## $ Departure.Delay.in.Minutes : int 50 0 0 0 0 0 0 77 1 28 ...
## $ Arrival.Delay.in.Minutes : num 44 0 0 6 20 0 0 65 0 14 ...
## $ satisfaction : Factor w/ 2 levels "dissatisfied",..: 2 2 1 2 2 2 2 2 2 2 ...
#predicting the satisfaction level
mdl1.pred <- predict(mdl1, flight_survey.test[,-23], type="response")
# first 5 actual and predicted records
data.frame(actual=flight_survey.test$satisfaction[1:5], predicted=mdl1.pred[1:5])
## actual predicted
## 1 satisfied 0.69941391
## 2 satisfied 0.87769925
## 3 dissatisfied 0.03870806
## 4 satisfied 0.31487670
## 5 satisfied 0.03355939
Setting the cutoff as 0.5
#classification by putting cut of 0.5
trainEstimatedResponse = ifelse(mdl1.pred > 0.5, "satisfied", "dissatisfied")
class(trainEstimatedResponse)
## [1] "character"
levels(as.factor(trainEstimatedResponse))
## [1] "dissatisfied" "satisfied"
# Accuracy, Estimation
table(flight_survey.test$satisfaction, trainEstimatedResponse)
## trainEstimatedResponse
## dissatisfied satisfied
## dissatisfied 13286 1242
## satisfied 2586 8779
mean(trainEstimatedResponse==flight_survey.test$satisfaction)
## [1] 0.8521608
library(caret)
library(e1071)
# confusion matrix
confusionMatrix(as.factor(ifelse(mdl1.pred>0.5, 'satisfied', 'dissatisfied')),
flight_survey.test$satisfaction, positive = "satisfied")
## Confusion Matrix and Statistics
##
## Reference
## Prediction dissatisfied satisfied
## dissatisfied 13286 2586
## satisfied 1242 8779
##
## Accuracy : 0.8522
## 95% CI : (0.8478, 0.8565)
## No Information Rate : 0.5611
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.6959
##
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.7725
## Specificity : 0.9145
## Pos Pred Value : 0.8761
## Neg Pred Value : 0.8371
## Prevalence : 0.4389
## Detection Rate : 0.3390
## Detection Prevalence : 0.3870
## Balanced Accuracy : 0.8435
##
## 'Positive' Class : satisfied
##
#computing accuracy per cutoff to select the best cutoff
accT = c()
for (cut in seq(0,1,0.1)){
cm <- confusionMatrix(as.factor(ifelse(mdl1.pred>cut, 'satisfied', 'dissatisfied')),
flight_survey.test$satisfaction, positive = "satisfied")
accT = c(accT, cm$overall[[1]])
}
## Warning in confusionMatrix.default(as.factor(ifelse(mdl1.pred > cut,
## "satisfied", : Levels are not in the same order for reference and data.
## Refactoring data to match.
## Warning in confusionMatrix.default(as.factor(ifelse(mdl1.pred > cut,
## "satisfied", : Levels are not in the same order for reference and data.
## Refactoring data to match.
# plot accuracy
plot(accT ~ seq(0,1,0.1), xlab = "Cutoff Value", ylab = "", type = "l", ylim = c(0, 1))
lines(1-accT ~ seq(0,1,0.1), type = "l", lty = 2)
legend("topright", c("accuracy", "overall error"), lty = c(1, 2), merge = TRUE)
#plotting the ROC curve
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
test_roc = roc(flight_survey.test$satisfaction ~ mdl1.pred, plot = TRUE, print.auc = T)
## Setting levels: control = dissatisfied, case = satisfied
## Setting direction: controls < cases
# compute auc
auc(test_roc)
## Area under the curve: 0.902
Selecting cutoff as 0.5 only, as the overall accuracy is high with the cut off and area under the curve is 92.6%
Creating default tree using rpart()
# default tree
set.seed(321)
default.ct <- rpart(satisfaction ~ .-Departure.Delay.in.Minutes , data = flight_survey, method = "class")
names(default.ct)
## [1] "frame" "where" "call"
## [4] "terms" "cptable" "method"
## [7] "parms" "control" "functions"
## [10] "numresp" "splits" "csplit"
## [13] "variable.importance" "y" "ordered"
#summary(default.ct)
default.ct$variable.importance
## Online.boarding Inflight.wifi.service Seat.comfort
## 14286.64627 11975.61158 6185.04966
## Ease.of.Online.booking Class Type.of.Travel
## 6174.78151 5433.58107 5082.20430
## Inflight.entertainment Age Leg.room.service
## 4433.65373 388.46768 125.52325
## Food.and.drink Cleanliness
## 24.42408 11.61714
length(default.ct$frame$var[default.ct$frame$var == "<leaf>"])
## [1] 6
# Plot tree
prp(default.ct, type = 2, extra = 1, under = TRUE, split.font = 1, varlen = -10, box.palette=c("red", "green"))
Checking the accuracy of the model
#Results
# Training data
default.ct.point.pred.train <- predict(default.ct, flight_survey[, -23],type = "class")
confusionMatrix(default.ct.point.pred.train, as.factor(flight_survey$satisfaction), positive = "satisfied")
## Confusion Matrix and Statistics
##
## Reference
## Prediction dissatisfied satisfied
## dissatisfied 39916 3575
## satisfied 5917 34212
##
## Accuracy : 0.8865
## 95% CI : (0.8843, 0.8886)
## No Information Rate : 0.5481
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.7721
##
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9054
## Specificity : 0.8709
## Pos Pred Value : 0.8526
## Neg Pred Value : 0.9178
## Prevalence : 0.4519
## Detection Rate : 0.4091
## Detection Prevalence : 0.4799
## Balanced Accuracy : 0.8881
##
## 'Positive' Class : satisfied
##
# Testing data
default.ct.point.pred.test <- predict(default.ct,flight_survey.test[, -23],type = "class")
confusionMatrix(default.ct.point.pred.test, as.factor(flight_survey.test$satisfaction), positive = "satisfied")
## Confusion Matrix and Statistics
##
## Reference
## Prediction dissatisfied satisfied
## dissatisfied 12561 1042
## satisfied 1967 10323
##
## Accuracy : 0.8838
## 95% CI : (0.8798, 0.8877)
## No Information Rate : 0.5611
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.7661
##
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9083
## Specificity : 0.8646
## Pos Pred Value : 0.8400
## Neg Pred Value : 0.9234
## Prevalence : 0.4389
## Detection Rate : 0.3987
## Detection Prevalence : 0.4746
## Balanced Accuracy : 0.8865
##
## 'Positive' Class : satisfied
##
Next, we use the cross validation proceedure, which gives an overfitted model
# using cp as 0.00001
cv.ct <- rpart(satisfaction ~ .-Departure.Delay.in.Minutes , data = flight_survey, method = "class",
cp = 0.00001, minsplit = 5, xval=5)
prp(cv.ct, type = 2, extra = 1, under = TRUE, split.font = 1, varlen = -10, box.palette=c("red", "green"))
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
cv.ct.point.pred.test <- predict(cv.ct,flight_survey.test[, -23],type = "class")
confusionMatrix(cv.ct.point.pred.test, as.factor(flight_survey.test$satisfaction), positive = "satisfied")
## Confusion Matrix and Statistics
##
## Reference
## Prediction dissatisfied satisfied
## dissatisfied 13879 669
## satisfied 649 10696
##
## Accuracy : 0.9491
## 95% CI : (0.9464, 0.9517)
## No Information Rate : 0.5611
## P-Value [Acc > NIR] : <0.0000000000000002
##
## Kappa : 0.8966
##
## Mcnemar's Test P-Value : 0.6007
##
## Sensitivity : 0.9411
## Specificity : 0.9553
## Pos Pred Value : 0.9428
## Neg Pred Value : 0.9540
## Prevalence : 0.4389
## Detection Rate : 0.4131
## Detection Prevalence : 0.4381
## Balanced Accuracy : 0.9482
##
## 'Positive' Class : satisfied
##
#printcp(cv.ct)
# pruning the tree to avoid overfitting
pruned.ct <- prune(cv.ct,
cp = cv.ct$cptable[which.min(cv.ct$cptable[,"xerror"]),"CP"])
length(pruned.ct$frame$var[pruned.ct$frame$var == "<leaf>"])
## [1] 225
prp(pruned.ct, type = 2, extra = 1, split.font = 1, varlen = -10, box.palette=c("red", "green"))
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
pruned.ct.point.pred.test <- predict(pruned.ct,flight_survey.test[, -23],type = "class")
confusionMatrix(pruned.ct.point.pred.test, as.factor(flight_survey.test$satisfaction), positive = "satisfied")
## Confusion Matrix and Statistics
##
## Reference
## Prediction dissatisfied satisfied
## dissatisfied 14119 713
## satisfied 409 10652
##
## Accuracy : 0.9567
## 95% CI : (0.9541, 0.9591)
## No Information Rate : 0.5611
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.9118
##
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9373
## Specificity : 0.9718
## Pos Pred Value : 0.9630
## Neg Pred Value : 0.9519
## Prevalence : 0.4389
## Detection Rate : 0.4114
## Detection Prevalence : 0.4272
## Balanced Accuracy : 0.9546
##
## 'Positive' Class : satisfied
##
#pruning using a lower cp
set.seed(1234)
pruned.ct1 <- prune(cv.ct, cp=0.006)
prp(pruned.ct1, type = 1, extra = 1, under = TRUE, split.font = 1, varlen = -10,
box.palette=c("red", "green"))
pruned.ct.point.pred.test1 <- predict(pruned.ct1,flight_survey.test[, -23],type = "class")
confusionMatrix(pruned.ct.point.pred.test1, as.factor(flight_survey.test$satisfaction), positive = "satisfied")
## Confusion Matrix and Statistics
##
## Reference
## Prediction dissatisfied satisfied
## dissatisfied 12930 668
## satisfied 1598 10697
##
## Accuracy : 0.9125
## 95% CI : (0.909, 0.9159)
## No Information Rate : 0.5611
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.8239
##
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9412
## Specificity : 0.8900
## Pos Pred Value : 0.8700
## Neg Pred Value : 0.9509
## Prevalence : 0.4389
## Detection Rate : 0.4131
## Detection Prevalence : 0.4748
## Balanced Accuracy : 0.9156
##
## 'Positive' Class : satisfied
##
# plotting the ROC for the final prunned tree
library("ROCR")
Pred.cart = predict(pruned.ct1, newdata = flight_survey.test[, -23], type = "prob")[,2]
Pred2 = prediction(Pred.cart, flight_survey.test$satisfaction)
plot(performance(Pred2, "tpr", "fpr"))
abline(0, 1, lty = 2)
auc = performance(Pred2, 'auc')
slot(auc, 'y.values')
## [[1]]
## [1] 0.9543035
We can also create a random forest to increase the accuracy of the model
# random forest
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
rf <- randomForest(as.factor(satisfaction) ~.-Departure.Delay.in.Minutes , data = flight_survey, ntree = 500,
mtry = 4, nodesize = 5, importance = TRUE, parms = list(loss = lossmatrix))
# important variables
varImpPlot(rf)
rf.predict <- predict(rf ,flight_survey.test[, -23],type = "class")
confusionMatrix(rf.predict, as.factor(flight_survey.test$satisfaction))
## Confusion Matrix and Statistics
##
## Reference
## Prediction dissatisfied satisfied
## dissatisfied 14218 660
## satisfied 310 10705
##
## Accuracy : 0.9625
## 95% CI : (0.9602, 0.9648)
## No Information Rate : 0.5611
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.9237
##
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9787
## Specificity : 0.9419
## Pos Pred Value : 0.9556
## Neg Pred Value : 0.9719
## Prevalence : 0.5611
## Detection Rate : 0.5491
## Detection Prevalence : 0.5746
## Balanced Accuracy : 0.9603
##
## 'Positive' Class : dissatisfied
##
#plotting the ROC curve
predictions <- as.numeric(predict(rf, flight_survey.test[, -23], type="response"))
pred <- prediction(predictions, flight_survey.test$satisfaction)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, col=rainbow(10))
auc1<- performance(pred,"auc")
print(auc1)
## A performance instance
## 'Area under the ROC curve'
slot(auc1, 'y.values')
## [[1]]
## [1] 0.9603193
In the base layer we added logistic model, decision tree, and random forest
small.index <- createDataPartition(flight_survey$satisfaction, p = 0.05, list = FALSE)
flight_survey.small <- flight_survey[small.index, ]
set.seed(4321)
control_stacking <- trainControl(method="repeatedcv", number=5, repeats=2, savePredictions=TRUE, classProbs=TRUE)
algorithms_to_use <- c('rpart', 'glm', 'rf')
stacked_models <- caretList(satisfaction ~.-Departure.Delay.in.Minutes , data = flight_survey.small, trControl = control_stacking, methodList = algorithms_to_use)
## Warning in trControlCheck(x = trControl, y = target): x$savePredictions == TRUE
## is depreciated. Setting to 'final' instead.
## Warning in trControlCheck(x = trControl, y = target): indexes not defined in
## trControl. Attempting to set them ourselves, so each model in the ensemble will
## have the same resampling indexes.
stacking_results <- resamples(stacked_models)
summary(stacking_results)
##
## Call:
## summary.resamples(object = stacking_results)
##
## Models: rpart, glm, rf
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rpart 0.8112306 0.8304919 0.8529573 0.8457705 0.8618850 0.8696172 0
## glm 0.8480861 0.8576980 0.8714115 0.8656150 0.8744019 0.8769415 0
## rf 0.9222488 0.9336918 0.9408245 0.9383062 0.9434809 0.9498208 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rpart 0.6145735 0.6538248 0.7024074 0.6865406 0.7193875 0.7367637 0
## glm 0.6917614 0.7112468 0.7394045 0.7276708 0.7455144 0.7521925 0
## rf 0.8433139 0.8663683 0.8803616 0.8753990 0.8855278 0.8985981 0
Next, these results are stacked on the logistic regression model
stackControl <- trainControl(method="repeatedcv", number=5, repeats=3, savePredictions=TRUE, classProbs=TRUE)
set.seed(100)
glm_stack <- caretStack(stacked_models, method="glm", metric="Accuracy", trControl=stackControl)
print(glm_stack)
## A glm ensemble of 3 base models: rpart, glm, rf
##
## Ensemble results:
## Generalized Linear Model
##
## 8364 samples
## 3 predictor
## 2 classes: 'dissatisfied', 'satisfied'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 6691, 6692, 6691, 6691, 6691, 6692, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9379883 0.8746998
Based ont the classification results, arrival delays lead to reduction in satisfaction probability. Therefore, next we explore the arrival delays to see what factors contribute the most in delays.
We begin by loading the dataset
flight_delay <- read.csv('delayml.csv')
flight_delay <- na.omit(flight_delay)
str(flight_delay)
## 'data.frame': 228528 obs. of 31 variables:
## $ YEAR : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ MONTH : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DAY : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DAY_OF_WEEK : int 4 4 4 4 4 4 4 4 4 4 ...
## $ AIRLINE : chr "NK" "NK" "HA" "B6" ...
## $ FLIGHT_NUMBER : int 597 168 17 1030 2134 2276 1057 425 89 328 ...
## $ TAIL_NUMBER : chr "N528NK" "N629NK" "N389HA" "N239JB" ...
## $ ORIGIN_AIRPORT : chr "MSP" "PHX" "LAS" "BQN" ...
## $ DESTINATION_AIRPORT: chr "FLL" "ORD" "HNL" "MCO" ...
## $ SCHEDULED_DEPARTURE: int 115 125 145 307 400 438 515 520 520 530 ...
## $ DEPARTURE_TIME : int 127 237 145 304 535 550 703 620 618 623 ...
## $ DEPARTURE_DELAY : int 12 72 0 -3 95 72 108 60 58 53 ...
## $ TAXI_OUT : int 14 9 16 25 9 15 15 13 19 32 ...
## $ WHEELS_OFF : int 141 246 201 329 544 605 718 633 637 655 ...
## $ SCHEDULED_TIME : int 207 204 370 173 185 241 161 150 141 125 ...
## $ ELAPSED_TIME : int 220 175 385 196 175 258 155 150 137 138 ...
## $ AIR_TIME : int 166 156 361 160 163 237 133 132 111 96 ...
## $ DISTANCE : int 1487 1440 2762 1129 1189 1666 1121 1009 964 641 ...
## $ WHEELS_ON : int 527 622 602 509 727 902 1031 945 928 931 ...
## $ TAXI_IN : int 40 10 8 11 3 6 7 5 7 10 ...
## $ SCHEDULED_ARRIVAL : int 542 549 555 500 605 739 856 850 841 835 ...
## $ ARRIVAL_TIME : int 607 632 610 520 730 908 1038 950 935 941 ...
## $ ARRIVAL_DELAY : int 25 43 15 20 85 89 102 60 54 66 ...
## $ DIVERTED : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CANCELLED : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CANCELLATION_REASON: chr "" "" "" "" ...
## $ AIR_SYSTEM_DELAY : int 25 43 0 20 0 17 0 0 0 13 ...
## $ SECURITY_DELAY : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AIRLINE_DELAY : int 0 0 15 0 85 72 0 60 54 53 ...
## $ LATE_AIRCRAFT_DELAY: int 0 0 0 0 0 0 0 0 0 0 ...
## $ WEATHER_DELAY : int 0 0 0 0 0 0 102 0 0 0 ...
## - attr(*, "na.action")= 'omit' Named int [1:820047] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "names")= chr [1:820047] "1" "2" "3" "4" ...
Next, we pre-process the data
flight_delay$DAY_OF_WEEK <- as.factor(flight_delay$DAY_OF_WEEK)
ggplot(flight_delay, aes(ARRIVAL_DELAY)) +
geom_histogram(aes(y=..count..),
fill="#c7ceea",
alpha = 0.8,
color="black",
bins = 30) +
labs(x = "Arrival Delay", y = "Frequency")
# departure delays and arrival delays
ggplot(flight_delay, aes(x=DEPARTURE_DELAY, y=ARRIVAL_DELAY)) + geom_point()
# taxi in and arrival delays
ggplot(flight_delay, aes(x=TAXI_IN, y=ARRIVAL_DELAY)) + geom_point()
# taxi out and arrival delays
ggplot(flight_delay, aes(x=TAXI_OUT, y=ARRIVAL_DELAY)) + geom_point()
Dividing data in training and testing
set.seed(123)
sample_size = round(nrow(flight_delay)*.80) # 80/20 rule
train_ind <- sample(seq_len(nrow(flight_delay)), size = sample_size)
flight_delay_train <- flight_delay[train_ind,]
flight_delay_test <- flight_delay[-train_ind,]
# setting the cross validation set
tr1 <- trainControl(method="cv", number=10)
#Model Building
myreg <- lm(ARRIVAL_DELAY ~ TAXI_OUT + TAXI_IN+ DEPARTURE_DELAY + DISTANCE + DAY_OF_WEEK, data = flight_delay_train, trainControl=tr1)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trainControl' will be disregarded
summary(myreg)
##
## Call:
## lm(formula = ARRIVAL_DELAY ~ TAXI_OUT + TAXI_IN + DEPARTURE_DELAY +
## DISTANCE + DAY_OF_WEEK, data = flight_delay_train, trainControl = tr1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69.724 -6.636 -0.319 6.042 103.026
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.75452581 0.08789556 -201.996 < 0.0000000000000002 ***
## TAXI_OUT 0.84791029 0.00163643 518.146 < 0.0000000000000002 ***
## TAXI_IN 0.86556765 0.00255199 339.174 < 0.0000000000000002 ***
## DEPARTURE_DELAY 0.95783968 0.00040322 2375.476 < 0.0000000000000002 ***
## DISTANCE -0.00124128 0.00004415 -28.117 < 0.0000000000000002 ***
## DAY_OF_WEEK2 0.16797471 0.09452577 1.777 0.07557 .
## DAY_OF_WEEK3 0.11942950 0.09863230 1.211 0.22595
## DAY_OF_WEEK4 0.09653473 0.09137618 1.056 0.29076
## DAY_OF_WEEK5 -0.23932816 0.09167694 -2.611 0.00904 **
## DAY_OF_WEEK6 0.75701216 0.10159176 7.452 0.0000000000000927 ***
## DAY_OF_WEEK7 1.20785985 0.09057564 13.335 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.15 on 182811 degrees of freedom
## Multiple R-squared: 0.9691, Adjusted R-squared: 0.9691
## F-statistic: 5.737e+05 on 10 and 182811 DF, p-value: < 0.00000000000000022
myreg_1 <- lm(ARRIVAL_DELAY ~ TAXI_OUT + TAXI_IN+ DISTANCE + DAY_OF_WEEK + DEPARTURE_DELAY + AIR_SYSTEM_DELAY + SECURITY_DELAY + AIRLINE_DELAY + WEATHER_DELAY, data = flight_delay_train, trainControl=tr)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trainControl' will be disregarded
summary(myreg_1)
##
## Call:
## lm(formula = ARRIVAL_DELAY ~ TAXI_OUT + TAXI_IN + DISTANCE +
## DAY_OF_WEEK + DEPARTURE_DELAY + AIR_SYSTEM_DELAY + SECURITY_DELAY +
## AIRLINE_DELAY + WEATHER_DELAY, data = flight_delay_train,
## trainControl = tr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -110.597 -5.867 0.028 5.772 110.101
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -13.92128306 0.08371675 -166.290 < 0.0000000000000002 ***
## TAXI_OUT 0.67579239 0.00177733 380.229 < 0.0000000000000002 ***
## TAXI_IN 0.69579181 0.00252829 275.202 < 0.0000000000000002 ***
## DISTANCE -0.00147131 0.00004061 -36.230 < 0.0000000000000002 ***
## DAY_OF_WEEK2 0.18340430 0.08684220 2.112 0.0347 *
## DAY_OF_WEEK3 0.05091912 0.09063409 0.562 0.5742
## DAY_OF_WEEK4 0.06992588 0.08394805 0.833 0.4049
## DAY_OF_WEEK5 -0.25467196 0.08423840 -3.023 0.0025 **
## DAY_OF_WEEK6 0.66058868 0.09337559 7.075 0.0000000000015 ***
## DAY_OF_WEEK7 1.05557750 0.08322058 12.684 < 0.0000000000000002 ***
## DEPARTURE_DELAY 0.91028726 0.00054981 1655.641 < 0.0000000000000002 ***
## AIR_SYSTEM_DELAY 0.20758286 0.00116709 177.863 < 0.0000000000000002 ***
## SECURITY_DELAY 0.07163513 0.01319258 5.430 0.0000000564388 ***
## AIRLINE_DELAY 0.06220866 0.00072552 85.743 < 0.0000000000000002 ***
## WEATHER_DELAY 0.07871302 0.00114855 68.533 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.25 on 182807 degrees of freedom
## Multiple R-squared: 0.9739, Adjusted R-squared: 0.9739
## F-statistic: 4.879e+05 on 14 and 182807 DF, p-value: < 0.00000000000000022
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'forecast'
## The following object is masked from 'package:caretEnsemble':
##
## autoplot
# Model 1
pred_values = predict(myreg, newdata = flight_delay_test)
flight_delay_test$pred_ad = pred_values
all.residuals <- flight_delay_test$ARRIVAL_DELAY - pred_values
My_residuals<-data.frame("Predicted" = pred_values, "Actual" = flight_delay_test$ARRIVAL_DELAY,
"Residual" = all.residuals)
head(My_residuals)
## Predicted Actual Residual
## 31 65.80590 43 -22.805896
## 53 82.08880 85 2.911204
## 56 67.15056 89 21.849441
## 107 143.33181 128 -15.331809
## 112 21.76651 25 3.233488
## 172 108.22121 116 7.778790
accuracy(pred_values, flight_delay_test$ARRIVAL_DELAY)
## ME RMSE MAE MPE MAPE
## Test set 0.02267564 11.17065 8.290572 -1.851053 23.88622
# Model 2
pred_values_1 = predict(myreg_1, newdata = flight_delay_test)
flight_delay_test$pred_ad_1 = pred_values_1
all.residuals_1 <- flight_delay_test$ARRIVAL_DELAY - pred_values_1
My_residuals_1<-data.frame("Predicted" = pred_values_1, "Actual" = flight_delay_test$ARRIVAL_DELAY,
"Residual" = all.residuals_1)
head(My_residuals_1)
## Predicted Actual Residual
## 31 71.53675 43 -28.5367542
## 53 84.33379 85 0.6662097
## 56 71.55769 89 17.4423051
## 107 145.50267 128 -17.5026745
## 112 22.04891 25 2.9510865
## 172 112.75542 116 3.2445843
accuracy(pred_values_1, flight_delay_test$ARRIVAL_DELAY)
## ME RMSE MAE MPE MAPE
## Test set 0.01640536 10.22251 7.588153 -1.80699 20.9394
We begin by loading libraries and storing dataset as a dataframe
flight_cluster <- read.csv('test.csv', header = TRUE)
Pre processing the dataset
flight_cluster$Gender<-as.factor(flight_cluster$Gender)
flight_cluster$Customer.Type<-as.factor(flight_cluster$Customer.Type)
flight_cluster$Type.of.Travel<-as.factor(flight_cluster$Type.of.Travel)
flight_cluster$Class<-as.factor(flight_cluster$Class)
flight_cluster$satisfaction<-as.factor(flight_cluster$satisfaction)
flight_cluster$Gender<-unclass(flight_cluster$Gender)
flight_cluster$Customer.Type<-unclass(flight_cluster$Customer.Type)
flight_cluster$Type.of.Travel<-unclass(flight_cluster$Type.of.Travel)
flight_cluster$Class<-unclass(flight_cluster$Class)
flight_cluster$satisfaction<-unclass(flight_cluster$satisfaction)
flight_cluster$Arrival.Delay.in.Minutes<-as.integer(flight_cluster$Arrival.Delay.in.Minutes)
str(flight_cluster) #converting all variables to integers.
## 'data.frame': 25976 obs. of 25 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ id : int 19556 90035 12360 77959 36875 39177 79433 97286 27508 62482 ...
## $ Gender : int 1 1 2 2 1 2 1 1 2 1 ...
## ..- attr(*, "levels")= chr [1:2] "Female" "Male"
## $ Customer.Type : int 2 2 1 2 2 2 2 2 2 2 ...
## ..- attr(*, "levels")= chr [1:2] "disloyal Customer" "Loyal Customer"
## $ Age : int 52 36 20 44 49 16 77 43 47 46 ...
## $ Type.of.Travel : int 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "levels")= chr [1:2] "Business travel" "Personal Travel"
## $ Class : int 2 1 2 1 2 2 1 1 2 1 ...
## ..- attr(*, "levels")= chr [1:3] "Business" "Eco" "Eco Plus"
## $ Flight.Distance : int 160 2863 192 3377 1182 311 3987 2556 556 1744 ...
## $ Inflight.wifi.service : int 5 1 2 0 2 3 5 2 5 2 ...
## $ Departure.Arrival.time.convenient: int 4 1 0 0 3 3 5 2 2 2 ...
## $ Ease.of.Online.booking : int 3 3 2 0 4 3 5 2 2 2 ...
## $ Gate.location : int 4 1 4 2 3 3 5 2 2 2 ...
## $ Food.and.drink : int 3 5 2 3 4 5 3 4 5 3 ...
## $ Online.boarding : int 4 4 2 4 1 5 5 4 5 4 ...
## $ Seat.comfort : int 3 5 2 4 2 3 5 5 5 4 ...
## $ Inflight.entertainment : int 5 4 2 1 2 5 5 4 5 4 ...
## $ On.board.service : int 5 4 4 1 2 4 5 4 2 4 ...
## $ Leg.room.service : int 5 4 1 1 2 3 5 4 2 4 ...
## $ Baggage.handling : int 5 4 3 1 2 1 5 4 5 4 ...
## $ Checkin.service : int 2 3 2 3 4 1 4 5 3 5 ...
## $ Inflight.service : int 5 4 2 1 2 2 5 4 3 4 ...
## $ Cleanliness : int 5 5 2 4 4 5 3 3 5 4 ...
## $ Departure.Delay.in.Minutes : int 50 0 0 0 0 0 0 77 1 28 ...
## $ Arrival.Delay.in.Minutes : int 44 0 0 6 20 0 0 65 0 14 ...
## $ satisfaction : int 2 2 1 2 2 2 2 2 2 2 ...
## ..- attr(*, "levels")= chr [1:2] "neutral or dissatisfied" "satisfied"
iqr <- IQR(flight_cluster$Departure.Delay.in.Minutes) #removing outliers in departure delays
Q <- quantile(flight_cluster$Departure.Delay.in.Minutes, probs=c(.25, .75), na.rm = FALSE)
flight_cluster<- subset(flight_cluster, flight_cluster$Departure.Delay.in.Minutes > (Q[1] - 2.5*iqr) & flight_cluster$Departure.Delay.in.Minutes < (Q[2]+2.5*iqr))
iqr2 <- IQR(flight_cluster$Arrival.Delay.in.Minutes, na.rm = TRUE) #removing outliers in arrival delays
Q1 <- quantile(flight_cluster$Arrival.Delay.in.Minutes, probs=c(.25, .75), na.rm = TRUE)
flight_cluster<- subset(flight_cluster, flight_cluster$Arrival.Delay.in.Minutes > (Q1[1] - 2.5*iqr2) & flight_cluster$Arrival.Delay.in.Minutes < (Q1[2]+2.5*iqr2))
summary(flight_cluster)
## X id Gender Customer.Type
## Min. : 1 Min. : 17 Min. :1.000 Min. :1.000
## 1st Qu.: 6544 1st Qu.: 32899 1st Qu.:1.000 1st Qu.:2.000
## Median :12978 Median : 65920 Median :1.000 Median :2.000
## Mean :13005 Mean : 65379 Mean :1.493 Mean :1.818
## 3rd Qu.:19521 3rd Qu.: 96972 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :25975 Max. :129877 Max. :2.000 Max. :2.000
## Age Type.of.Travel Class Flight.Distance
## Min. : 7.00 Min. :1.000 Min. :1.000 Min. : 31
## 1st Qu.:27.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.: 409
## Median :40.00 Median :1.000 Median :2.000 Median : 853
## Mean :39.67 Mean :1.308 Mean :1.587 Mean :1197
## 3rd Qu.:51.00 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:1750
## Max. :85.00 Max. :2.000 Max. :3.000 Max. :4983
## Inflight.wifi.service Departure.Arrival.time.convenient Ease.of.Online.booking
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :3.000 Median :3.000 Median :3.000
## Mean :2.735 Mean :3.047 Mean :2.758
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000
## Gate.location Food.and.drink Online.boarding Seat.comfort
## Min. :1.000 Min. :0.000 Min. :0.000 Min. :1.00
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.00
## Median :3.000 Median :3.000 Median :4.000 Median :4.00
## Mean :2.967 Mean :3.232 Mean :3.282 Mean :3.47
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:5.00
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.00
## Inflight.entertainment On.board.service Leg.room.service Baggage.handling
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:3.000
## Median :4.000 Median :4.000 Median :4.000 Median :4.000
## Mean :3.379 Mean :3.408 Mean :3.356 Mean :3.644
## 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Checkin.service Inflight.service Cleanliness Departure.Delay.in.Minutes
## Min. :1.000 Min. :0.000 Min. :1.000 Min. : 0.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.: 0.000
## Median :3.000 Median :4.000 Median :3.000 Median : 0.000
## Mean :3.331 Mean :3.677 Mean :3.304 Mean : 3.103
## 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.: 3.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :41.000
## Arrival.Delay.in.Minutes satisfaction
## Min. : 0.000 Min. :1.000
## 1st Qu.: 0.000 1st Qu.:1.000
## Median : 0.000 Median :1.000
## Mean : 2.546 Mean :1.456
## 3rd Qu.: 3.000 3rd Qu.:2.000
## Max. :20.000 Max. :2.000
summary(flight_cluster$satisfaction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.456 2.000 2.000
Preparing the dataset for PAM clustering
flight_PAM <- read.csv('test.csv', header = TRUE)
flight_PAM$Gender<-as.factor(flight_PAM$Gender)
flight_PAM$Customer.Type<-as.factor(flight_PAM$Customer.Type)
flight_PAM$Type.of.Travel<-as.factor(flight_PAM$Type.of.Travel)
flight_PAM$Class<-as.factor(flight_PAM$Class)
flight_PAM$satisfaction<-as.factor(flight_PAM$satisfaction)
flight_PAM$Arrival.Delay.in.Minutes<-as.integer(flight_PAM$Arrival.Delay.in.Minutes)
str(flight_PAM)
## 'data.frame': 25976 obs. of 25 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ id : int 19556 90035 12360 77959 36875 39177 79433 97286 27508 62482 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 2 2 1 2 1 1 2 1 ...
## $ Customer.Type : Factor w/ 2 levels "disloyal Customer",..: 2 2 1 2 2 2 2 2 2 2 ...
## $ Age : int 52 36 20 44 49 16 77 43 47 46 ...
## $ Type.of.Travel : Factor w/ 2 levels "Business travel",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Class : Factor w/ 3 levels "Business","Eco",..: 2 1 2 1 2 2 1 1 2 1 ...
## $ Flight.Distance : int 160 2863 192 3377 1182 311 3987 2556 556 1744 ...
## $ Inflight.wifi.service : int 5 1 2 0 2 3 5 2 5 2 ...
## $ Departure.Arrival.time.convenient: int 4 1 0 0 3 3 5 2 2 2 ...
## $ Ease.of.Online.booking : int 3 3 2 0 4 3 5 2 2 2 ...
## $ Gate.location : int 4 1 4 2 3 3 5 2 2 2 ...
## $ Food.and.drink : int 3 5 2 3 4 5 3 4 5 3 ...
## $ Online.boarding : int 4 4 2 4 1 5 5 4 5 4 ...
## $ Seat.comfort : int 3 5 2 4 2 3 5 5 5 4 ...
## $ Inflight.entertainment : int 5 4 2 1 2 5 5 4 5 4 ...
## $ On.board.service : int 5 4 4 1 2 4 5 4 2 4 ...
## $ Leg.room.service : int 5 4 1 1 2 3 5 4 2 4 ...
## $ Baggage.handling : int 5 4 3 1 2 1 5 4 5 4 ...
## $ Checkin.service : int 2 3 2 3 4 1 4 5 3 5 ...
## $ Inflight.service : int 5 4 2 1 2 2 5 4 3 4 ...
## $ Cleanliness : int 5 5 2 4 4 5 3 3 5 4 ...
## $ Departure.Delay.in.Minutes : int 50 0 0 0 0 0 0 77 1 28 ...
## $ Arrival.Delay.in.Minutes : int 44 0 0 6 20 0 0 65 0 14 ...
## $ satisfaction : Factor w/ 2 levels "neutral or dissatisfied",..: 2 2 1 2 2 2 2 2 2 2 ...
iqr <- IQR(flight_PAM$Departure.Delay.in.Minutes)
Q <- quantile(flight_PAM$Departure.Delay.in.Minutes, probs=c(.25, .75), na.rm = FALSE)
flight_PAM<- subset(flight_PAM, flight_PAM$Departure.Delay.in.Minutes > (Q[1] - 2.5*iqr) & flight_PAM$Departure.Delay.in.Minutes < (Q[2]+2.5*iqr))
iqr2 <- IQR(flight_PAM$Arrival.Delay.in.Minutes, na.rm = TRUE)
Q1 <- quantile(flight_PAM$Arrival.Delay.in.Minutes, probs=c(.25, .75), na.rm = TRUE)
flight_PAM<- subset(flight_PAM, flight_PAM$Arrival.Delay.in.Minutes > (Q1[1] - 2.5*iqr2) & flight_PAM$Arrival.Delay.in.Minutes < (Q1[2]+2.5*iqr2))
summary(flight_PAM)
## X id Gender Customer.Type
## Min. : 1 Min. : 17 Female:10603 disloyal Customer: 3810
## 1st Qu.: 6544 1st Qu.: 32899 Male :10309 Loyal Customer :17102
## Median :12978 Median : 65920
## Mean :13005 Mean : 65379
## 3rd Qu.:19521 3rd Qu.: 96972
## Max. :25975 Max. :129877
## Age Type.of.Travel Class Flight.Distance
## Min. : 7.00 Business travel:14461 Business:10157 Min. : 31
## 1st Qu.:27.00 Personal Travel: 6451 Eco : 9228 1st Qu.: 409
## Median :40.00 Eco Plus: 1527 Median : 853
## Mean :39.67 Mean :1197
## 3rd Qu.:51.00 3rd Qu.:1750
## Max. :85.00 Max. :4983
## Inflight.wifi.service Departure.Arrival.time.convenient Ease.of.Online.booking
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :3.000 Median :3.000 Median :3.000
## Mean :2.735 Mean :3.047 Mean :2.758
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000
## Gate.location Food.and.drink Online.boarding Seat.comfort
## Min. :1.000 Min. :0.000 Min. :0.000 Min. :1.00
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.00
## Median :3.000 Median :3.000 Median :4.000 Median :4.00
## Mean :2.967 Mean :3.232 Mean :3.282 Mean :3.47
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:5.00
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.00
## Inflight.entertainment On.board.service Leg.room.service Baggage.handling
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:3.000
## Median :4.000 Median :4.000 Median :4.000 Median :4.000
## Mean :3.379 Mean :3.408 Mean :3.356 Mean :3.644
## 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Checkin.service Inflight.service Cleanliness Departure.Delay.in.Minutes
## Min. :1.000 Min. :0.000 Min. :1.000 Min. : 0.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.: 0.000
## Median :3.000 Median :4.000 Median :3.000 Median : 0.000
## Mean :3.331 Mean :3.677 Mean :3.304 Mean : 3.103
## 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.: 3.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :41.000
## Arrival.Delay.in.Minutes satisfaction
## Min. : 0.000 neutral or dissatisfied:11367
## 1st Qu.: 0.000 satisfied : 9545
## Median : 0.000
## Mean : 2.546
## 3rd Qu.: 3.000
## Max. :20.000
summary(flight_PAM$satisfaction)
## neutral or dissatisfied satisfied
## 11367 9545
hist(flight_cluster$satisfaction, data=flight_cluster, main="Distribution of Satisfaction", xlab="Customer Satisfaction")
## Warning in plot.window(xlim, ylim, "", ...): "data" is not a graphical parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "data"
## is not a graphical parameter
## Warning in axis(1, ...): "data" is not a graphical parameter
## Warning in axis(2, ...): "data" is not a graphical parameter
set.seed(1)
smaller.index <-createDataPartition(flight_cluster$satisfaction, p = 0.15, list = FALSE)
#Using create data partition function due to uneven distribution of satisfaction.
flight_clean <- flight_cluster[smaller.index, ]#could not interpret the entire data set due to processing power limitations.
smaller.indexPAM <-createDataPartition(flight_PAM$satisfaction, p = 0.15, list = FALSE) #This is for PAM Clustering
flight_clean_PAM <- flight_PAM[smaller.indexPAM, ]
set.seed(1)
smaller.index.test <-createDataPartition(flight_clean$satisfaction, p = 0.95, list = FALSE)
flight_clean_test <- flight_cluster[smaller.index.test, ]
colSums(is.na(flight_clean)) #only four observations that are missing values.
## X id
## 0 0
## Gender Customer.Type
## 0 0
## Age Type.of.Travel
## 0 0
## Class Flight.Distance
## 0 0
## Inflight.wifi.service Departure.Arrival.time.convenient
## 0 0
## Ease.of.Online.booking Gate.location
## 0 0
## Food.and.drink Online.boarding
## 0 0
## Seat.comfort Inflight.entertainment
## 0 0
## On.board.service Leg.room.service
## 0 0
## Baggage.handling Checkin.service
## 0 0
## Inflight.service Cleanliness
## 0 0
## Departure.Delay.in.Minutes Arrival.Delay.in.Minutes
## 0 0
## satisfaction
## 0
flight_clean_PAM <- na.omit(flight_clean_PAM)
flight_clean <- na.omit(flight_clean)
flight_clean_test <- na.omit(flight_clean_test)
flight_clean <- flight_clean[-c(1,2)]
flight_clean_PAM <- flight_clean_PAM[-c(1,2)]
flight_clean_test <- flight_clean_test[-c(1,2)]
head(flight_clean,3)
## Gender Customer.Type Age Type.of.Travel Class Flight.Distance
## 17 1 2 31 1 2 728
## 33 2 1 41 1 2 624
## 48 1 2 59 2 2 460
## Inflight.wifi.service Departure.Arrival.time.convenient
## 17 2 5
## 33 2 3
## 48 2 5
## Ease.of.Online.booking Gate.location Food.and.drink Online.boarding
## 17 5 5 2 2
## 33 2 4 5 2
## 48 2 5 3 3
## Seat.comfort Inflight.entertainment On.board.service Leg.room.service
## 17 2 2 4 3
## 33 5 5 4 3
## 48 4 5 5 2
## Baggage.handling Checkin.service Inflight.service Cleanliness
## 17 3 4 3 2
## 33 3 1 4 5
## 48 4 1 5 2
## Departure.Delay.in.Minutes Arrival.Delay.in.Minutes satisfaction
## 17 2 0 1
## 33 0 0 1
## 48 4 2 1
flight_clean_standardized<-as.data.frame(scale(flight_clean[1:23])) #normalizing data set considering there are multiple different measurements included in the data.
flight_clean__test_standardized<-as.data.frame(scale(flight_clean_test[1:23]))
flight_clean_PAM_standardized<-as.data.frame(scale(flight_clean_PAM[6:22]))
distance <- get_dist(flight_clean_standardized, method = "euclidean")
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
k2 <- kmeans(flight_clean_standardized, centers = 2, nstart=10, iter.max=10)
k3 <- kmeans(flight_clean_standardized, centers = 3, nstart = 25, iter.max = 10)
k4 <- kmeans(flight_clean_standardized, centers = 4, nstart = 25, iter.max = 10)
k5 <- kmeans(flight_clean_standardized, centers = 5, nstart = 25, iter.max = 10)
k10 <- kmeans(flight_clean_standardized, centers = 10, nstart = 25, iter.max = 10)
# plots to compare
p2 <- fviz_cluster(k2, geom = "point", data = flight_clean_standardized) + ggtitle("k = 2")
p3 <- fviz_cluster(k3, geom = "point", data = flight_clean_standardized) + ggtitle("k = 3")
p4 <- fviz_cluster(k4, geom = "point", data = flight_clean_standardized) + ggtitle("k = 4")
p5 <- fviz_cluster(k5, geom = "point", data = flight_clean_standardized) + ggtitle("k = 5")
p10 <- fviz_cluster(k10, geom = "point", data = flight_clean_standardized) + ggtitle("k = 10")
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(p2, p3, p4, p5, p10, nrow = 3)
my1<-fviz_nbclust(flight_clean_standardized, kmeans, method="wss")
my2<-fviz_nbclust(flight_clean_standardized, kmeans, method = "silhouette")
grid.arrange(my1, my2, nrow = 2)
gap_stat <- clusGap(flight_clean_standardized, FUN = kmeans, nstart = 10,
K.max = 10, B = 50)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 156850)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 156850)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
fviz_gap_stat(gap_stat)
kmeans_perf2 = function(data,maxc,ns)
{
result = as.data.frame(matrix(ncol=3, nrow=maxc-1))
colnames(result) = c("clusters", "rsq","silhouette")
dst <- daisy(data)
for(i in 2:maxc) {
cst <- kmeans(data,i,iter.max=100,nstart=ns)
rsq <- 1-cst$tot.withinss/(cst$totss)
slht <- silhouette(cst$cluster,dst)
result[i-1,]=c(i,rsq,mean(slht[,3]))
}
ggplot(result, aes(clusters)) +
geom_line(aes(y = rsq, colour = "rsq")) +
geom_line(aes(y = silhouette, colour = "silhouette"))
}
kmeans_perf2(flight_clean_standardized,15,ns=10)
## Warning in daisy(data): binary variable(s) 1, 2, 4, 23 treated as interval
## scaled
set.seed(1)
final <- kmeans(flight_clean_standardized, 2, nstart = 25, iter.max = 10)
#print(final)
fviz_cluster(final, geom = "point",data = flight_clean_standardized)
flight_clean_standardized %>%
mutate(Cluster = final$cluster) %>%
group_by(Cluster) %>%
summarise_all("mean")
## # A tibble: 2 × 24
## Cluster Gender Customer.Type Age Type.of.Travel Class Flight.Distance
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 -0.0212 -0.0994 -0.117 0.327 0.401 -0.261
## 2 2 0.0215 0.101 0.118 -0.331 -0.406 0.265
## # … with 17 more variables: Inflight.wifi.service <dbl>,
## # Departure.Arrival.time.convenient <dbl>, Ease.of.Online.booking <dbl>,
## # Gate.location <dbl>, Food.and.drink <dbl>, Online.boarding <dbl>,
## # Seat.comfort <dbl>, Inflight.entertainment <dbl>, On.board.service <dbl>,
## # Leg.room.service <dbl>, Baggage.handling <dbl>, Checkin.service <dbl>,
## # Inflight.service <dbl>, Cleanliness <dbl>,
## # Departure.Delay.in.Minutes <dbl>, Arrival.Delay.in.Minutes <dbl>, …
flight_clean_standardized$Cluster<-as.factor(final$cluster)
flight_clean$Cluster<-as.factor(final$cluster) #adding as a factor variable to the original data set so you can use it for supervised learning.
flight_clean %>%
group_by(Cluster) %>%
summarise_all("mean")
## # A tibble: 2 × 24
## Cluster Gender Customer.Type Age Type.of.Travel Class Flight.Distance
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1.49 1.78 37.5 1.48 1.85 920.
## 2 2 1.51 1.86 41.1 1.17 1.35 1446.
## # … with 17 more variables: Inflight.wifi.service <dbl>,
## # Departure.Arrival.time.convenient <dbl>, Ease.of.Online.booking <dbl>,
## # Gate.location <dbl>, Food.and.drink <dbl>, Online.boarding <dbl>,
## # Seat.comfort <dbl>, Inflight.entertainment <dbl>, On.board.service <dbl>,
## # Leg.room.service <dbl>, Baggage.handling <dbl>, Checkin.service <dbl>,
## # Inflight.service <dbl>, Cleanliness <dbl>,
## # Departure.Delay.in.Minutes <dbl>, Arrival.Delay.in.Minutes <dbl>, …
k_results <- flight_clean %>%
mutate(cluster = final$clustering) %>%
group_by(Cluster) %>%
do(the_summary = summary(.))
k_results$the_summary
## [[1]]
## Gender Customer.Type Age Type.of.Travel Class
## Min. :1.000 Min. :1.000 Min. : 7.00 Min. :1.000 Min. :1.00
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:1.00
## Median :1.000 Median :2.000 Median :37.00 Median :1.000 Median :2.00
## Mean :1.488 Mean :1.778 Mean :37.51 Mean :1.475 Mean :1.85
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:49.00 3rd Qu.:2.000 3rd Qu.:2.00
## Max. :2.000 Max. :2.000 Max. :80.00 Max. :2.000 Max. :3.00
## Flight.Distance Inflight.wifi.service Departure.Arrival.time.convenient
## Min. : 67.0 Min. :0.00 Min. :0.000
## 1st Qu.: 368.0 1st Qu.:2.00 1st Qu.:2.000
## Median : 666.0 Median :2.00 Median :3.000
## Mean : 919.7 Mean :2.33 Mean :3.079
## 3rd Qu.:1107.0 3rd Qu.:3.00 3rd Qu.:4.000
## Max. :4243.0 Max. :5.00 Max. :5.000
## Ease.of.Online.booking Gate.location Food.and.drink Online.boarding
## Min. :0.000 Min. :1.000 Min. :1.000 Min. :0.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :3.000 Median :2.000 Median :3.000
## Mean :2.503 Mean :2.944 Mean :2.558 Mean :2.587
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Seat.comfort Inflight.entertainment On.board.service Leg.room.service
## Min. :1.000 Min. :1.000 Min. :1.00 Min. :0.00
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:2.00
## Median :3.000 Median :2.000 Median :3.00 Median :3.00
## Mean :2.694 Mean :2.437 Mean :2.89 Mean :2.88
## 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:4.00 3rd Qu.:4.00
## Max. :5.000 Max. :5.000 Max. :5.00 Max. :5.00
## Baggage.handling Checkin.service Inflight.service Cleanliness
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :3.000 Median :3.000 Median :3.000 Median :2.000
## Mean :3.206 Mean :2.967 Mean :3.267 Mean :2.529
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Departure.Delay.in.Minutes Arrival.Delay.in.Minutes satisfaction Cluster
## Min. : 0.000 Min. : 0.000 Min. :1.000 1:1578
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.:1.000 2: 0
## Median : 0.000 Median : 0.000 Median :1.000
## Mean : 3.523 Mean : 2.963 Mean :1.115
## 3rd Qu.: 4.000 3rd Qu.: 4.000 3rd Qu.:1.000
## Max. :41.000 Max. :20.000 Max. :2.000
##
## [[2]]
## Gender Customer.Type Age Type.of.Travel Class
## Min. :1.000 Min. :1.000 Min. : 7.00 Min. :1.000 Min. :1.00
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:30.00 1st Qu.:1.000 1st Qu.:1.00
## Median :2.000 Median :2.000 Median :42.00 Median :1.000 Median :1.00
## Mean :1.509 Mean :1.855 Mean :41.09 Mean :1.168 Mean :1.35
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:52.00 3rd Qu.:1.000 3rd Qu.:2.00
## Max. :2.000 Max. :2.000 Max. :80.00 Max. :2.000 Max. :3.00
## Flight.Distance Inflight.wifi.service Departure.Arrival.time.convenient
## Min. : 31 Min. :0.000 Min. :0.000
## 1st Qu.: 489 1st Qu.:2.000 1st Qu.:2.000
## Median :1085 Median :3.000 Median :3.000
## Mean :1446 Mean :3.186 Mean :3.128
## 3rd Qu.:2264 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :4983 Max. :5.000 Max. :5.000
## Ease.of.Online.booking Gate.location Food.and.drink Online.boarding
## Min. :0.000 Min. :1.000 Min. :1.000 Min. :0.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:4.000
## Median :3.000 Median :3.000 Median :4.000 Median :4.000
## Mean :3.097 Mean :3.037 Mean :3.875 Mean :3.963
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Seat.comfort Inflight.entertainment On.board.service Leg.room.service
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:3.000 1st Qu.:3.000
## Median :4.000 Median :4.000 Median :4.000 Median :4.000
## Mean :4.219 Mean :4.314 Mean :3.917 Mean :3.832
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Baggage.handling Checkin.service Inflight.service Cleanliness
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:3.000 1st Qu.:4.000 1st Qu.:4.000
## Median :4.000 Median :4.000 Median :4.000 Median :4.000
## Mean :4.117 Mean :3.702 Mean :4.128 Mean :4.091
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Departure.Delay.in.Minutes Arrival.Delay.in.Minutes satisfaction Cluster
## Min. : 0.000 Min. : 0.000 Min. :1.000 1: 0
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.:2.000 2:1559
## Median : 0.000 Median : 0.000 Median :2.000
## Mean : 2.836 Mean : 2.261 Mean :1.758
## 3rd Qu.: 3.000 3rd Qu.: 1.000 3rd Qu.:2.000
## Max. :38.000 Max. :20.000 Max. :2.000
flight_clean %>%
group_by(Cluster) %>%
summarise_all("median")
## # A tibble: 2 × 24
## Cluster Gender Customer.Type Age Type.of.Travel Class Flight.Distance
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 2 37 1 2 666
## 2 2 2 2 42 1 1 1085
## # … with 17 more variables: Inflight.wifi.service <dbl>,
## # Departure.Arrival.time.convenient <dbl>, Ease.of.Online.booking <dbl>,
## # Gate.location <dbl>, Food.and.drink <dbl>, Online.boarding <dbl>,
## # Seat.comfort <dbl>, Inflight.entertainment <dbl>, On.board.service <dbl>,
## # Leg.room.service <dbl>, Baggage.handling <dbl>, Checkin.service <dbl>,
## # Inflight.service <dbl>, Cleanliness <dbl>,
## # Departure.Delay.in.Minutes <dbl>, Arrival.Delay.in.Minutes <dbl>, …
flight_clean %>%
arrange(desc(Cluster)) %>%
ggparcoord(
columns = 1:23, groupColumn = "Cluster", order = "anyClass",
showPoints = TRUE,
title = "Original",
alphaLines = 1
) +
scale_color_manual(values=c( "#69b3a2", "#E8E8E8", "#E8E8E8") ) +
theme(
legend.position="Default",
plot.title = element_text(size=10)
) +
xlab("")+
theme(axis.text.x = element_text(angle = 90))
myclustergraph <- ggparcoord(data = flight_clean, columns = c(1:23), groupColumn = "Cluster", scale = "std") + labs(x = "Flight Variables", y = "value (in standard-deviation units)",par(las=2), title = "Clustering")+
theme(axis.text.x = element_text(angle = 90))
ggplotly(myclustergraph)
ddataCat <- flight_clean %>%
mutate(cluster = final$cluster)
#print()
DemoClusterJoin <- data.frame(flight_clean$satisfaction, ddataCat$cluster)
table00<-as.matrix(table(DemoClusterJoin))
table01<-100*prop.table(table00,2)
print(table01) #displaying that clearly the majority of satisfied customers fall in the second cluster.
## ddataCat.cluster
## flight_clean.satisfaction 1 2
## 1 88.52978 24.18217
## 2 11.47022 75.81783
set.seed(1)
final_test <- kmeans(flight_clean__test_standardized, 2, nstart = 25, iter.max = 10)
#print(final_test)
fviz_cluster(final_test, geom = "point",data = flight_clean__test_standardized)
flight_clean__test_standardized %>%
mutate(Cluster = final_test$cluster) %>%
group_by(Cluster) %>%
summarise_all("mean")
## # A tibble: 2 × 24
## Cluster Gender Customer.Type Age Type.of.Travel Class Flight.Distance
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 -0.00755 0.163 0.168 -0.396 -0.499 0.311
## 2 2 0.00646 -0.140 -0.144 0.339 0.427 -0.267
## # … with 17 more variables: Inflight.wifi.service <dbl>,
## # Departure.Arrival.time.convenient <dbl>, Ease.of.Online.booking <dbl>,
## # Gate.location <dbl>, Food.and.drink <dbl>, Online.boarding <dbl>,
## # Seat.comfort <dbl>, Inflight.entertainment <dbl>, On.board.service <dbl>,
## # Leg.room.service <dbl>, Baggage.handling <dbl>, Checkin.service <dbl>,
## # Inflight.service <dbl>, Cleanliness <dbl>,
## # Departure.Delay.in.Minutes <dbl>, Arrival.Delay.in.Minutes <dbl>, …
flight_clean_test$Cluster<-as.factor(final_test$cluster) #adding as a factor variable to the original data set so you can use it for supervised learning.
flight_clean %>%
group_by(Cluster) %>%
summarise_all("mean")
## # A tibble: 2 × 24
## Cluster Gender Customer.Type Age Type.of.Travel Class Flight.Distance
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1.49 1.78 37.5 1.48 1.85 920.
## 2 2 1.51 1.86 41.1 1.17 1.35 1446.
## # … with 17 more variables: Inflight.wifi.service <dbl>,
## # Departure.Arrival.time.convenient <dbl>, Ease.of.Online.booking <dbl>,
## # Gate.location <dbl>, Food.and.drink <dbl>, Online.boarding <dbl>,
## # Seat.comfort <dbl>, Inflight.entertainment <dbl>, On.board.service <dbl>,
## # Leg.room.service <dbl>, Baggage.handling <dbl>, Checkin.service <dbl>,
## # Inflight.service <dbl>, Cleanliness <dbl>,
## # Departure.Delay.in.Minutes <dbl>, Arrival.Delay.in.Minutes <dbl>, …
gower_dist<-daisy(flight_clean_PAM_standardized,metric="gower")
fviz_dist(gower_dist, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
fviz_nbclust(flight_clean_standardized[,-24], FUN = pam, method = "silhouette")
fviz_nbclust(flight_clean_standardized[,-24], FUN = pam, method = "wss")
k<-2
pam_fit <- pam(gower_dist, diss = TRUE, k)
flight_clean_PAM$Cluster<-as.factor(pam_fit$cluster)
pam_results <- flight_clean_PAM %>%
mutate(cluster = pam_fit$clustering) %>%
group_by(cluster) %>%
do(the_summary = summary(.))
pam_results$the_summary
## [[1]]
## Gender Customer.Type Age Type.of.Travel
## Female:565 disloyal Customer:232 Min. : 7.0 Business travel:646
## Male :556 Loyal Customer :889 1st Qu.:25.0 Personal Travel:475
## Median :38.0
## Mean :38.2
## 3rd Qu.:51.0
## Max. :85.0
## Class Flight.Distance Inflight.wifi.service
## Business:375 Min. : 67.0 Min. :0.000
## Eco :617 1st Qu.: 384.0 1st Qu.:2.000
## Eco Plus:129 Median : 696.0 Median :2.000
## Mean : 969.6 Mean :2.401
## 3rd Qu.:1235.0 3rd Qu.:3.000
## Max. :3998.0 Max. :5.000
## Departure.Arrival.time.convenient Ease.of.Online.booking Gate.location
## Min. :0.000 Min. :0.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :3.000 Median :3.000 Median :3.000
## Mean :3.056 Mean :2.664 Mean :3.003
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000
## Food.and.drink Online.boarding Seat.comfort Inflight.entertainment
## Min. :0.000 Min. :0.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000
## Median :2.000 Median :3.000 Median :2.000 Median :2.000
## Mean :2.235 Mean :2.759 Mean :2.354 Mean :1.951
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:2.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## On.board.service Leg.room.service Baggage.handling Checkin.service
## Min. :1.000 Min. :0.000 Min. :1.00 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:2.000
## Median :3.000 Median :3.000 Median :3.00 Median :3.000
## Mean :2.722 Mean :2.797 Mean :3.12 Mean :2.968
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.00 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.00 Max. :5.000
## Inflight.service Cleanliness Departure.Delay.in.Minutes
## Min. :1.000 Min. :1.00 Min. : 0.000
## 1st Qu.:2.000 1st Qu.:1.00 1st Qu.: 0.000
## Median :3.000 Median :2.00 Median : 0.000
## Mean :3.169 Mean :2.16 Mean : 3.277
## 3rd Qu.:4.000 3rd Qu.:3.00 3rd Qu.: 3.000
## Max. :5.000 Max. :5.00 Max. :39.000
## Arrival.Delay.in.Minutes satisfaction Cluster cluster
## Min. : 0.00 neutral or dissatisfied:900 1:1121 Min. :1
## 1st Qu.: 0.00 satisfied :221 2: 0 1st Qu.:1
## Median : 0.00 Median :1
## Mean : 2.89 Mean :1
## 3rd Qu.: 4.00 3rd Qu.:1
## Max. :20.00 Max. :1
##
## [[2]]
## Gender Customer.Type Age Type.of.Travel
## Female:1020 disloyal Customer: 332 Min. : 7.00 Business travel:1494
## Male : 997 Loyal Customer :1685 1st Qu.:29.00 Personal Travel: 523
## Median :41.00
## Mean :40.18
## 3rd Qu.:51.00
## Max. :85.00
## Class Flight.Distance Inflight.wifi.service
## Business:1159 Min. : 31 Min. :0.000
## Eco : 737 1st Qu.: 453 1st Qu.:2.000
## Eco Plus: 121 Median : 937 Median :3.000
## Mean :1312 Mean :2.917
## 3rd Qu.:2062 3rd Qu.:4.000
## Max. :4963 Max. :5.000
## Departure.Arrival.time.convenient Ease.of.Online.booking Gate.location
## Min. :0.000 Min. :0.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :3.000 Median :3.000 Median :3.000
## Mean :3.046 Mean :2.813 Mean :2.963
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000
## Food.and.drink Online.boarding Seat.comfort Inflight.entertainment
## Min. :1.000 Min. :0.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:4.000 1st Qu.:4.000
## Median :4.000 Median :4.000 Median :4.000 Median :4.000
## Mean :3.784 Mean :3.537 Mean :4.064 Mean :4.206
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## On.board.service Leg.room.service Baggage.handling Checkin.service
## Min. :1.000 Min. :0.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:4.000 1st Qu.:3.000
## Median :4.000 Median :4.000 Median :4.000 Median :4.000
## Mean :3.804 Mean :3.674 Mean :3.971 Mean :3.517
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Inflight.service Cleanliness Departure.Delay.in.Minutes
## Min. :1.000 Min. :1.000 Min. : 0.000
## 1st Qu.:4.000 1st Qu.:3.000 1st Qu.: 0.000
## Median :4.000 Median :4.000 Median : 0.000
## Mean :3.986 Mean :3.953 Mean : 2.798
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.: 2.000
## Max. :5.000 Max. :5.000 Max. :39.000
## Arrival.Delay.in.Minutes satisfaction Cluster cluster
## Min. : 0.000 neutral or dissatisfied: 806 1: 0 Min. :2
## 1st Qu.: 0.000 satisfied :1211 2:2017 1st Qu.:2
## Median : 0.000 Median :2
## Mean : 2.339 Mean :2
## 3rd Qu.: 2.000 3rd Qu.:2
## Max. :20.000 Max. :2
Again, very similar results compared to the k-means cluster test and k-means clusters.
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
tsne_data <- tsne_obj$Y %>%
data.frame() %>%
setNames(c("X", "Y")) %>%
mutate(cluster = factor(pam_fit$clustering),
name = flight_clean_PAM_standardized$satisfaction)
ggplot(aes(x = X, y = Y), data = tsne_data) +
geom_point(aes(color = cluster))
ddataCat2 <- flight_clean_PAM %>%
mutate(cluster = pam_fit$cluster)
#print()
DemoClusterJoin1 <- data.frame(flight_clean_PAM$satisfaction, ddataCat2$cluster)
table03<-as.matrix(table(DemoClusterJoin1))
table04<-100*prop.table(table03,2)
print(table04)
## ddataCat2.cluster
## flight_clean_PAM.satisfaction 1 2
## neutral or dissatisfied 80.28546 39.96034
## satisfied 19.71454 60.03966
set.seed(2)
smaller.index <-createDataPartition(flight_clean_standardized$satisfaction, p = 0.35, list = FALSE)
#Reducing the partition of the already partitioned data due to processing issues from hierarchical clustering
flight_clean_h <- flight_cluster[smaller.index, ]
colSums(is.na(flight_clean_h))
## X id
## 0 0
## Gender Customer.Type
## 0 0
## Age Type.of.Travel
## 0 0
## Class Flight.Distance
## 0 0
## Inflight.wifi.service Departure.Arrival.time.convenient
## 0 0
## Ease.of.Online.booking Gate.location
## 0 0
## Food.and.drink Online.boarding
## 0 0
## Seat.comfort Inflight.entertainment
## 0 0
## On.board.service Leg.room.service
## 0 0
## Baggage.handling Checkin.service
## 0 0
## Inflight.service Cleanliness
## 0 0
## Departure.Delay.in.Minutes Arrival.Delay.in.Minutes
## 0 0
## satisfaction
## 0
flight_clean_h <- na.omit(flight_clean_h)
methods <- c( "average", "single", "complete", "ward")
names(methods) <- c( "average", "single", "complete", "ward")
#
ac <- function(x) {
agnes(flight_clean_h, method = x)$ac
}
map_dbl(methods, ac)
## average single complete ward
## 0.9887540 0.7107700 0.9942590 0.9994965
Ward gives best results.
hier_cluster_for_flight <- agnes(flight_clean_h, method = "ward")
pltree(hier_cluster_for_flight, cex = 0.6, hang = -1, main = "Dendrogram of agnes")
fviz_nbclust(flight_clean_h, FUN = hcut, method = "wss")
fviz_nbclust(flight_clean_h, FUN = hcut, method = "silhouette")
hier_flight_cluster <- cutree(hier_cluster_for_flight, k = 2)
table(hier_flight_cluster)
## hier_flight_cluster
## 1 2
## 658 440
aggregate(flight_clean_h[,-c(1,1)],list(hier_flight_cluster),mean)
## Group.1 id Gender Customer.Type Age Type.of.Travel Class
## 1 1 39688.85 1.500000 1.826748 39.91337 1.297872 1.615502
## 2 2 104149.08 1.527273 1.809091 40.90000 1.306818 1.513636
## Flight.Distance Inflight.wifi.service Departure.Arrival.time.convenient
## 1 1205.764 2.822188 3.118541
## 2 1291.159 2.779545 3.211364
## Ease.of.Online.booking Gate.location Food.and.drink Online.boarding
## 1 2.761398 2.962006 3.24924 3.284195
## 2 2.893182 3.034091 3.20000 3.322727
## Seat.comfort Inflight.entertainment On.board.service Leg.room.service
## 1 3.483283 3.430091 3.348024 3.354103
## 2 3.484091 3.331818 3.436364 3.481818
## Baggage.handling Checkin.service Inflight.service Cleanliness
## 1 3.559271 3.229483 3.711246 3.303951
## 2 3.704545 3.481818 3.759091 3.327273
## Departure.Delay.in.Minutes Arrival.Delay.in.Minutes satisfaction
## 1 2.319149 2.246201 1.462006
## 2 3.843182 2.695455 1.450000